rm(list=ls())
library(ezids)
library(ggplot2)
library(dplyr)
library(readr)
library(tidyverse)
library (tidyr)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)
This project uses the State and County Housing Market Indicators dataset from the American Enterprise Institute Housing Center, found here. The variables are:
| Original Variable Name | New Variable Name | Definition |
|---|---|---|
| State | State | state |
| County | County_Name | County |
| FIPS | FIPS_County_Code | 5-digit Federal Information Processing Series codes (first 2 digits indicate state, last 3 indicate sub-county entity) |
| Year | Year | Year when the data was collected |
| Tier | Affordability | Categorizes home sales into entry-level (<=80th percentile of FHA sales prices), move-up (all others), and all |
| Median.Sale.Price..in.Thousands. | Median_Sale_Price_in_k | Median sale price in thousands of USD per county |
| House.Price.Appreciation.since.2012 | House_Price_Appreciation_since_2012_percent | Cumulative home price appreciation since 2012 |
| House.Price.Appreciation..Year.over.Year | House_Price_Appreciation_yr_over_yr_percent | Home price appreciation since the previous year |
| Months..Supply | Months_Supply | Number of months it would take for the inventory of existing homes for sale to be exhausted at the current sales pace |
| New.Construction.Share.of.Sales | New_Constr_by_share_of_sales_percent | Percent of sales comprising new construction |
| Mortgage.Default.Rate | Mortgage_Default_Rate_percent | AEI Mortgage Default Rate, a measure of how loans originating in a given month would perform under the same conditions as the 2007 financial crisis (<=7%: Low Risk; between 7.01% and 14%: Medium Risk; >14%: High Risk) |
housing = read.csv("/Users/ilgazkuscu/Documents/GitHub/housing-price-vs-supply-2024/Data/state_county_data_download_2025.csv")
housing %>% slice_sample(n=5)
## State County FIPS Year Tier Median.Sale.Price..in.Thousands.
## 1 IA Webster County 19187 2024 all $126
## 2 NH Coos County 33007 2015 entrylevel $64
## 3 GA Butts County 13035 2016 entrylevel $100
## 4 KY Warren County 21227 2014 moveup $257
## 5 WI Adams County 55001 2017 entrylevel $70
## House.Price.Appreciation.since.2012 House.Price.Appreciation..Year.over.Year.
## 1 85.40% 4.10%
## 2 3.50% 5.80%
## 3 31.80% 7.80%
## 4 4.60% 2.90%
## 5 9.50% 5.30%
## Months..Supply New.Construction.Share.of.Sales Mortgage.Default.Rate
## 1 2.7 1.60% 17.30%
## 2 12.3 1.70% 13.30%
## 3 1.6 7.10% 26.10%
## 4 7.9 18.40% 10.40%
## 5 6.0 0.90%
The data is limited to the year 2024 and cleaned of NA values, and the variables are renamed for clarity.
housing_2024 = housing %>% filter(housing$Year == 2024, housing$State != 'AA National') %>% na.omit %>%
#rename cols
rename(
Median_Sale_Price_per_k = Median.Sale.Price..in.Thousands.,
House_Price_Appreciation_yr_over_yr_percent = House.Price.Appreciation..Year.over.Year.,
House_Price_Appreciation_since_2012_percent = House.Price.Appreciation.since.2012,
Months_Supply = Months..Supply,
New_Constr_by_share_of_sales_percent = New.Construction.Share.of.Sales,
Mortgage_Default_Rate_percent = Mortgage.Default.Rate,
County_Name = County,
FIPS_County_Code = FIPS,
Affordability = Tier
)
head(housing_2024)
## State County_Name FIPS_County_Code Year Affordability
## 1 AK AA State 0 2024 all
## 2 AK AA State 0 2024 entrylevel
## 3 AK AA State 0 2024 moveup
## 7 AK Aleutians West Census Area 2016 2024 all
## 8 AK Aleutians West Census Area 2016 2024 entrylevel
## 9 AK Aleutians West Census Area 2016 2024 moveup
## Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1 $394 59.60%
## 2 $314 59.90%
## 3 $564 57.40%
## 7
## 8
## 9
## House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1 5.40% 3.2
## 2 5.60% 2.7
## 3 5.00% 4.2
## 7 1.8
## 8 2.8
## 9 1.6
## New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1 6.30% 12.50%
## 2 4.60% 13.50%
## 3 9.50% 10.20%
## 7
## 8
## 9
The typing of the variables is also corrected. Some require the symbols “$” and “%” to be removed beforehand, so that is also done.
# as factors
housing_2024$State = as.factor(housing_2024$State)
housing_2024$County_Name = as.factor(housing_2024$County_Name)
housing_2024$FIPS_County_Code = as.factor(housing_2024$FIPS_County_Code)
housing_2024$Affordability = as.factor(housing_2024$Affordability)
# remove prefixes '$' and '%' from values
housing_2024 = housing_2024 %>%
mutate(Median_Sale_Price_per_k = gsub("\\$", "", Median_Sale_Price_per_k),
House_Price_Appreciation_since_2012_percent =
gsub("%","",House_Price_Appreciation_since_2012_percent),
House_Price_Appreciation_yr_over_yr_percent =
gsub("%","",House_Price_Appreciation_yr_over_yr_percent),
New_Constr_by_share_of_sales_percent = gsub("%","",New_Constr_by_share_of_sales_percent),
Mortgage_Default_Rate_percent = gsub("%","",Mortgage_Default_Rate_percent)
)
# as num instead of chr
housing_2024$Median_Sale_Price_per_k = as.numeric(housing_2024$Median_Sale_Price_per_k)
housing_2024$House_Price_Appreciation_since_2012_percent =
as.numeric(housing_2024$House_Price_Appreciation_since_2012_percent)
housing_2024$House_Price_Appreciation_yr_over_yr_percent =
as.numeric(housing_2024$House_Price_Appreciation_yr_over_yr_percent)
housing_2024$New_Constr_by_share_of_sales_percent =
as.numeric(housing_2024$New_Constr_by_share_of_sales_percent)
housing_2024$Mortgage_Default_Rate_percent = as.numeric(housing_2024$Mortgage_Default_Rate_percent)
# For some reason is rounding the data in quite a weird way—inaccurately
# view data
housing_2024 %>% slice_sample(n=5)
## State County_Name FIPS_County_Code Year Affordability
## 1 MI Mackinac County 26097 2024 entrylevel
## 2 MO Ripley County 29181 2024 all
## 3 OH Muskingum County 39119 2024 all
## 4 AR Calhoun County 5013 2024 moveup
## 5 AL Cullman County 1043 2024 moveup
## Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1 NA NA
## 2 161 115
## 3 167 114
## 4 253 NA
## 5 410 NA
## House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1 NA 5.9
## 2 9.3 2.1
## 3 6.1 2.2
## 4 NA 5.3
## 5 NA 7.4
## New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1 NA NA
## 2 5.2 17.2
## 3 2.6 17.1
## 4 0.0 16.9
## 5 27.4 10.0
# def=
# {
# Low: all sales below the 40th percentile of FHA sales prices
# Low-medium: all sales at or below the 80th percentile of FHA sales prices
# Medium-high: all sales at or below 125% of the GSE loan limit
# High: all other sales
# entry-level: low and low-medium price tiers
# move-up: medium-high and high price tiers
# }
xkablesummary(housing_2024)
| State | County_Name | FIPS_County_Code | Year | Affordability | Median_Sale_Price_per_k | House_Price_Appreciation_since_2012_percent | House_Price_Appreciation_yr_over_yr_percent | Months_Supply | New_Constr_by_share_of_sales_percent | Mortgage_Default_Rate_percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | TX : 733 | AA State : 153 | 0 : 153 | Min. :2024 | all :3097 | Min. : 14 | Min. : 3.7 | Min. :-32.90 | Min. : 0.00 | Min. : 0.0 | Min. : 0.2 |
| Q1 | GA : 480 | Washington County: 87 | 1001 : 3 | 1st Qu.:2024 | entrylevel:3094 | 1st Qu.:150 | 1st Qu.: 86.6 | 1st Qu.: 2.80 | 1st Qu.: 2.20 | 1st Qu.: 2.7 | 1st Qu.:10.2 |
| Median | KY : 363 | Jefferson County : 74 | 1003 : 3 | Median :2024 | moveup :3047 | Median :260 | Median :105.2 | Median : 5.70 | Median : 3.20 | Median : 6.9 | Median :13.9 |
| Mean | MO : 337 | Franklin County : 72 | 1005 : 3 | Mean :2024 | NA | Mean :286 | Mean :110.8 | Mean : 5.87 | Mean : 4.36 | Mean : 10.7 | Mean :14.2 |
| Q3 | VA : 326 | Jackson County : 68 | 1007 : 3 | 3rd Qu.:2024 | NA | 3rd Qu.:390 | 3rd Qu.:129.7 | 3rd Qu.: 8.60 | 3rd Qu.: 5.20 | 3rd Qu.: 15.0 | 3rd Qu.:17.4 |
| Max | IL : 307 | Lincoln County : 66 | 1009 : 3 | Max. :2024 | NA | Max. :999 | Max. :279.1 | Max. : 89.20 | Max. :24.00 | Max. :100.0 | Max. :36.0 |
| NA | (Other):6692 | (Other) :8718 | (Other):9070 | NA | NA | NA’s :992 | NA’s :2177 | NA’s :2204 | NA | NA’s :859 | NA’s :1963 |
ggplot(housing_2024, aes(x = reorder(State, -Median_Sale_Price_per_k, median),
y = Median_Sale_Price_per_k)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
coord_flip() +
labs(
title = "Distribution of Median Sale Prices by State (2024)",
x = "State",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
There are too many to be particularly useful. You can see general trends, but I am going to run this with a smaller state sample.
top_states <- housing_2024 %>%
dplyr::count(State, sort = TRUE) %>%
dplyr::slice_max(n, n = 5)
bottom_states <- housing_2024 %>%
dplyr::count(State, sort = TRUE) %>%
dplyr::slice_min(n, n = 5)
# merge top and bottom states
housing_compare <- housing_2024 %>%
filter(State %in% c(top_states$State, bottom_states$State)) %>%
mutate(StateGroup = case_when(
State %in% top_states$State ~ "States with the Most Houses",
State %in% bottom_states$State ~ "States with the Fewest Houses"
))%>%
mutate(StateGroup = factor(StateGroup, levels = c(
"States with the Most Houses",
"States with the Fewest Houses"
)))
#plot top bottom comparison ####
ggplot(housing_compare, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
geom_boxplot() +
facet_wrap(~ StateGroup, scales = "free_x") +
labs(
title = "Median Sale Price in States with Most vs Least Housing Records",
subtitle = "The Median Sale Price in States with a Larger Supply of Houses is Significantly Lower\nthan States with a Smaller Supply of Houses",
x = "State",
y = "Median Sale Price (in thousands)"
) +
theme_minimal() +
scale_fill_manual(values = c("States with the Most Houses" = "skyblue", "States with the Fewest Houses" = "red"))
ggplot(housing_2024, aes(x = Median_Sale_Price_per_k)) +
geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
labs(
title = "Distribution of Median Sale Prices (2024)",
x = "Median Sale Price (in thousands)",
y = "Count"
) +
theme_minimal()
#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me
ggplot(housing_2024, aes(x = Affordability, y = Median_Sale_Price_per_k, fill = Affordability)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Affordability Tier (2024)",
x = "Affordability Tier",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
#no duh, there are more unpurchased expensive houses because people can't afford it
#not sure how useful this is, but maybe as a starting baseline
ggplot(housing_2024, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
geom_point(aes(color = Affordability), alpha = 0.7) +
# geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Housing Supply vs Median Sale Price",
x = "Months of Supply",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
#this is a good one, needs a commentary subtitle and some cleaning
# Not at all what I would have expected. I wonder why this is?
housing_numeric <- housing_2024 %>%
select(where(is.numeric)) %>%
drop_na()
cor_matrix <- cor(housing_numeric)
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)
#interesting to see positive and inverse relationships between variables
#did not scan for no relationships
# Create a vector of colors for top states (blue shades) and bottom states (red shades)
top_states_colors <- scales::seq_gradient_pal("lightblue", "navy")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"]))))
names(top_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"])
bottom_states_colors <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"]))))
names(bottom_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"])
#last point in line for state label
label_points <- housing_compare %>%
group_by(State) %>%
filter(Months_Supply == max(Months_Supply, na.rm = TRUE)) %>%
ungroup()
# Combine into one color vector
state_colors <- c(top_states_colors, bottom_states_colors)
#all in gray with faceted compare in color with states labeled ####
ggplot() +
geom_point(data = housing_2024, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1) +
# geom_path(data = housing_compare,
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# size = 1, alpha = 0.8) +
geom_point(data = housing_compare,
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
size = 2, alpha = 0.5) +
geom_text_repel(data = label_points,
aes(x = Months_Supply, y = Median_Sale_Price_per_k,
label = State, color=State),
size = 3.5, stroke=0.01, show.legend = FALSE) +
facet_wrap(~ StateGroup) +
scale_color_manual(values = state_colors) +
labs(
title = "Housing Supply vs Median Price by State with Grouped Colors",
subtitle = "Some commentary here",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State"
) +
theme_minimal()
#can't read state names, maybe find a way to make it stand out
ggplot() +
geom_point(data = housing_2024,
aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
scale_color_manual(values = state_colors) +
labs(
title = "Housing Supply vs Median Price: All Counties with Highlights",
subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State"
) +
theme_minimal()
housing_constr = housing_2024
housing_constr = housing_constr %>%
mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))
ggplot(housing_constr, aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?
ggplot(filter(housing_constr,Affordability=="entrylevel"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)"
) +
theme_minimal()
# Even more pronounced here